import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.offline as pyo
%matplotlib inline
plt.style.use('seaborn')
pyo.init_notebook_mode()
from time import perf_counter
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer, QuantileTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_percentage_error
In addition to the already imported libraries, we are going to use some additional functions and classes.
The first class GridSearchResults is imported from https://www.davidsbatista.net/blog/2018/02/23/model_optimization/
It is a useful class to extract the results of a grid search and save it as a dataframe
class GridSearchResults:
    """Run a GridSearchCV for several estimators and summarise all results.

    Adapted from https://www.davidsbatista.net/blog/2018/02/23/model_optimization/
    Each estimator in ``models`` is grid-searched over the matching entry in
    ``params``; :meth:`score_summary` then flattens every (estimator, parameter
    combination) result into a single sorted DataFrame.
    """

    def __init__(self, models, params):
        """Store the estimators and their parameter grids.

        Parameters
        ----------
        models : dict mapping name -> estimator (or pipeline)
        params : dict mapping name -> parameter grid for GridSearchCV

        Raises
        ------
        ValueError
            If some model name has no corresponding parameter grid.
        """
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        # Filled by fit(): name -> fitted GridSearchCV / elapsed seconds.
        self.grid_searches = {}
        self.time = {}

    def fit(self, X, y, cv=10, n_jobs=-1, verbose=1, scoring=None, refit=False):
        """Run and time one GridSearchCV per registered estimator.

        ``cv`` may be an integer or any scikit-learn CV splitter; the fitted
        search objects are kept in ``self.grid_searches`` for later summary.
        """
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            start = perf_counter()
            gs.fit(X, y)
            end = perf_counter()
            elapsed = round(end - start, 3)
            print(f"Grid Search took {elapsed} seconds for {key}\n")
            self.grid_searches[key] = gs
            self.time[key] = elapsed

    def score_summary(self, sort_by='mean_score'):
        """Return one DataFrame row per (estimator, parameter combination).

        Columns: estimator name, min/mean/max/std of the per-fold test scores,
        plus one column per tuned hyper-parameter. Sorted by ``sort_by``
        descending (scores are negative for neg_* scorers, so "best first").

        Raises
        ------
        RuntimeError
            If called before :meth:`fit` (the original code crashed inside
            ``pd.concat`` with an unhelpful message in that case).
        """
        if not self.grid_searches:
            raise RuntimeError("No grid-search results: call fit() first.")

        def row(key, scores, params):
            # Collapse the per-fold scores of one parameter combination.
            d = {
                'estimator': key,
                'min_score': min(scores),
                'max_score': max(scores),
                'mean_score': np.mean(scores),
                'std_score': np.std(scores),
            }
            return pd.Series({**params, **d})

        rows = []
        for k in self.grid_searches:
            print(k)
            params = self.grid_searches[k].cv_results_['params']
            scores = []
            # n_splits_ is set by GridSearchCV after fitting and works whether
            # cv was given as an int or as a splitter object (the previous
            # ``.cv`` attribute broke for splitter objects).
            for i in range(self.grid_searches[k].n_splits_):
                key = "split{}_test_score".format(i)
                r = self.grid_searches[k].cv_results_[key]
                scores.append(r.reshape(len(params), 1))
            all_scores = np.hstack(scores)
            for p, s in zip(params, all_scores):
                rows.append(row(k, s, p))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)
        # Fixed summary columns first, hyper-parameter columns after.
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]
        return df[columns]
In addition we are going to create four small functions to help us build pipelines
Before measuring the error, we must be careful: we cannot measure the error on the transformed target without applying the inverse transform first
def create_pipeline(model, model_name, scaler):
    """Build a Pipeline that scales the numeric columns, then fits *model*.

    Uses the module-level ``num_columns`` list; non-numeric columns pass
    through untouched. ``model_name`` becomes the name of the estimator step.
    """
    column_scaling = make_column_transformer(
        (scaler, num_columns),
        remainder="passthrough",
    )
    return Pipeline(steps=[
        ("preprocess", column_scaling),
        (model_name, model),
    ])
def create_log_pipeline(model, model_name, scaler):
    """Like ``create_pipeline`` but trains on a log-transformed target.

    ``TransformedTargetRegressor`` applies ``np.log`` to y before fitting and
    ``np.exp`` to the predictions, so errors are measured on the original
    scale.
    """
    column_scaling = make_column_transformer(
        (scaler, num_columns),
        remainder="passthrough",
    )
    log_target_model = TransformedTargetRegressor(
        regressor=model, func=np.log, inverse_func=np.exp,
    )
    return Pipeline(steps=[
        ("preprocess", column_scaling),
        (model_name, log_target_model),
    ])
def create_power_pipeline(model, model_name, scaler):
    """Like ``create_pipeline`` but trains on a power-transformed target.

    ``PowerTransformer`` (Yeo-Johnson by default) is fitted on y and inverted
    on the predictions, so errors are measured on the original scale.
    PowerTransformer is already imported at the top of the file; the previous
    redundant function-local import has been removed.
    """
    preprocess = make_column_transformer(
        (scaler, num_columns),
        remainder="passthrough"
    )
    regressor = TransformedTargetRegressor(
        regressor=model,
        transformer=PowerTransformer(),
    )
    steps = [
        ("preprocess", preprocess),
        (model_name, regressor)
    ]
    pipeline = Pipeline(steps=steps)
    return pipeline
# Load the cleaned Seattle building-energy dataset produced by the previous
# cleaning step and eyeball a random sample of rows.
df = pd.read_csv("seattle_clean2.csv")
df.sample(10)
| OSEBuildingID | BuildingType | PropertyName | YearBuilt | NumberofBuildings | NumberofFloors | PropertyGFATotal | PropertyGFAParking | PropertyGFABuilding(s) | LargestPropertyUseType | ... | SiteEnergyUse(kBtu) | TotalGHGEmissions | GHGEmissionsIntensity | SteamRatio | ElectricityRatio | NaturalGasRatio | SurfacePerBuilding | SurfacePerFloor | ParkingRatio | BuildingRatio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 29 | 65 | NonResidential | Inn at the Market | 1985 | 1.0 | 7 | 71150 | 0 | 71150 | Hotel | ... | 5.348309e+06 | 134.38 | 1.89 | 0.00000 | 0.606528 | 0.393472 | 71150.0 | 10164.285714 | 0.000000 | 1.000000 |
| 17 | 35 | NonResidential | Hotel Five | 1978 | 1.0 | 5 | 68410 | 16200 | 52210 | Hotel | ... | 4.456714e+06 | 128.44 | 1.88 | 0.00000 | 0.526475 | 0.473525 | 52210.0 | 10442.000000 | 0.236807 | 0.763193 |
| 808 | 24908 | School | Salmon Bay K-8 | 1931 | 1.0 | 3 | 117116 | 0 | 117116 | K-12 School | ... | 5.722326e+06 | 256.31 | 2.19 | 0.00000 | 0.180285 | 0.819715 | 117116.0 | 39038.666667 | 0.000000 | 1.000000 |
| 298 | 633 | NonResidential | 83 S KING ST BLDG (ID633) | 1904 | 1.0 | 7 | 184322 | 0 | 184322 | Office | ... | 9.722890e+06 | 67.78 | 0.37 | 0.00000 | 1.000000 | 0.000000 | 184322.0 | 26331.714286 | 0.000000 | 1.000000 |
| 913 | 27562 | NonResidential | MVI 7th Avenue | 1974 | 1.0 | 1 | 43380 | 0 | 43380 | Warehouse | ... | 4.050620e+05 | 2.82 | 0.07 | 0.00000 | 1.000000 | 0.000000 | 43380.0 | 43380.000000 | 0.000000 | 1.000000 |
| 659 | 22818 | NonResidential | Jack Warehouse | 1980 | 1.0 | 1 | 24100 | 0 | 24100 | Warehouse | ... | 8.098708e+05 | 30.49 | 1.27 | 0.00000 | 0.335250 | 0.664750 | 24100.0 | 24100.000000 | 0.000000 | 1.000000 |
| 830 | 25567 | NonResidential | 1109 N NORTHLAKE WAY | 1938 | 1.0 | 1 | 23050 | 0 | 23050 | Warehouse | ... | 4.666727e+05 | 3.25 | 0.14 | 0.00000 | 1.000001 | 0.000000 | 23050.0 | 23050.000000 | 0.000000 | 1.000000 |
| 485 | 20372 | NonResidential | Hotel Andra | 1925 | 1.0 | 9 | 104000 | 0 | 104000 | Hotel | ... | 8.690648e+06 | 355.80 | 3.42 | 0.41177 | 0.478655 | 0.109575 | 104000.0 | 11555.555556 | 0.000000 | 1.000000 |
| 629 | 21964 | NonResidential | (ID21964) BALLARD BAPTIST CHURCH | 1910 | 1.0 | 2 | 21765 | 0 | 21765 | Worship Facility | ... | 4.313904e+05 | 17.38 | 0.80 | 0.00000 | 0.277989 | 0.722010 | 21765.0 | 10882.500000 | 0.000000 | 1.000000 |
| 217 | 495 | NonResidential | 4245 Roosevelt 2ros4245 | 1994 | 1.0 | 4 | 191276 | 95281 | 95995 | Medical Office | ... | 8.422861e+06 | 77.99 | 0.41 | 0.00000 | 0.950424 | 0.049575 | 95995.0 | 23998.750000 | 0.498134 | 0.501866 |
10 rows × 22 columns
# Column dtypes and non-null counts: 991 rows x 22 columns, no missing values.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 991 entries, 0 to 990 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 OSEBuildingID 991 non-null int64 1 BuildingType 991 non-null object 2 PropertyName 991 non-null object 3 YearBuilt 991 non-null int64 4 NumberofBuildings 991 non-null float64 5 NumberofFloors 991 non-null int64 6 PropertyGFATotal 991 non-null int64 7 PropertyGFAParking 991 non-null int64 8 PropertyGFABuilding(s) 991 non-null int64 9 LargestPropertyUseType 991 non-null object 10 ENERGYSTARScore 991 non-null float64 11 SiteEUI(kBtu/sf) 991 non-null float64 12 SiteEnergyUse(kBtu) 991 non-null float64 13 TotalGHGEmissions 991 non-null float64 14 GHGEmissionsIntensity 991 non-null float64 15 SteamRatio 991 non-null float64 16 ElectricityRatio 991 non-null float64 17 NaturalGasRatio 991 non-null float64 18 SurfacePerBuilding 991 non-null float64 19 SurfacePerFloor 991 non-null float64 20 ParkingRatio 991 non-null float64 21 BuildingRatio 991 non-null float64 dtypes: float64(13), int64(6), object(3) memory usage: 170.5+ KB
# Compare the raw target distribution with its log transform side by side.
# np.log applied directly to the Series is vectorised — same values as
# Series.apply(np.log) but without the per-element Python call overhead.
df['EnergyLog'] = np.log(df['SiteEUI(kBtu/sf)'])
fig, ax = plt.subplots(1, 2, figsize=(16, 6), facecolor="#eaeaf2")
sns.histplot(data=df, x="SiteEUI(kBtu/sf)", ax=ax[0])
sns.histplot(data=df, x='EnergyLog', ax=ax[1], kde=True);
The distribution of the target column is skewed. In general, algorithms perform better with a normal distribution. We can apply a transform function to the data to approximate the normal distribution. Here we can see that the logarithm transform is doing a good job. In the notebook we are also going to try the power transform.
We must be careful of converting the data to their original form before measuring the error
Data Scaling is another form of data transformation we can use to improve the performance of the model. It works in two steps:
We are going to use two scaling methods, standard and robust scaling.
Standard Scaling is the most commonly used scaling method. First the mean is calculated and subtracted from the train set. Then the values are divided by the standard deviation of the train data. Robust Scaling is similar to standard scaling. However it uses the median and the interquartile range instead of the mean and the standard deviation. Because of this, robust scaling generally performs better in the presence of outliers.
When the scaling is applied to the validation and test data, the values used for the calculation are those of the train data.
# Keep the identifier columns aside so predictions can be matched back to
# buildings later, then drop them along with the alternative-target /
# leakage columns derived from the same energy measurements.
names = df[['OSEBuildingID', 'PropertyName']].copy()
df.drop(['OSEBuildingID', 'PropertyName'], axis=1, inplace=True)
df = df.drop(['SiteEnergyUse(kBtu)', 'EnergyLog', 'TotalGHGEmissions', 'GHGEmissionsIntensity'], axis=1)
# Target / feature split.
y = df["SiteEUI(kBtu/sf)"].copy()
X = df.drop("SiteEUI(kBtu/sf)", axis=1)
# One-hot encode the categorical (object) columns.
X = pd.get_dummies(X)
# NOTE(review): the dtype mask is computed AFTER get_dummies, so cat_columns
# is always empty (confirmed by the printed output below). num_columns ends
# up listing every feature column and is the list the pipeline builders use.
mask = X.dtypes=="object"
num_types = ['int64', 'float64']
cat_columns = X.columns[mask].tolist()
num_columns = X.select_dtypes(include=num_types).columns.tolist()
print(cat_columns)
print(num_columns)
print(len(cat_columns) + len(num_columns))
[] ['YearBuilt', 'NumberofBuildings', 'NumberofFloors', 'PropertyGFATotal', 'PropertyGFAParking', 'PropertyGFABuilding(s)', 'ENERGYSTARScore', 'SteamRatio', 'ElectricityRatio', 'NaturalGasRatio', 'SurfacePerBuilding', 'SurfacePerFloor', 'ParkingRatio', 'BuildingRatio'] 14
from pycaret.regression import *
To select the best algorithm, we are going to use PyCaret, a low code machine learning solution
It is also possible to tune the hyperparameter using PyCaret but we are going to do it with a gridsearch to also compare the different target transformation. When this notebook was written, PyCaret didn't completely support target transformation
# PyCaret experiment with z-score (standard) scaling of the numeric features.
s = setup(df, target="SiteEUI(kBtu/sf)", normalize=True, normalize_method='zscore',
          session_id=7239) #session_id works similarly to sklearn random_state
| Description | Value | |
|---|---|---|
| 0 | Session id | 7239 |
| 1 | Target | SiteEUI(kBtu/sf) |
| 2 | Target type | Regression |
| 3 | Data shape | (991, 31) |
| 4 | Train data shape | (693, 31) |
| 5 | Test data shape | (298, 31) |
| 6 | Numeric features | 14 |
| 7 | Categorical features | 2 |
| 8 | Preprocess | True |
| 9 | Imputation type | simple |
| 10 | Numeric imputation | mean |
| 11 | Categorical imputation | mode |
| 12 | Maximum one-hot encoding | 25 |
| 13 | Encoding method | None |
| 14 | Normalize | True |
| 15 | Normalize method | zscore |
| 16 | Fold Generator | KFold |
| 17 | Fold Number | 10 |
| 18 | CPU Jobs | -1 |
| 19 | Use GPU | False |
| 20 | Log Experiment | False |
| 21 | Experiment Name | reg-default-name |
| 22 | USI | 8c5a |
# Cross-validate every available PyCaret regressor and keep the best one.
best = compare_models()
| Model | MAE | MSE | RMSE | R2 | RMSLE | MAPE | TT (Sec) | |
|---|---|---|---|---|---|---|---|---|
| et | Extra Trees Regressor | 16.1470 | 744.5405 | 26.6593 | 0.8132 | 0.3600 | 0.3202 | 0.1930 |
| gbr | Gradient Boosting Regressor | 15.6418 | 707.0861 | 26.2730 | 0.8094 | 0.3683 | 0.3204 | 0.1410 |
| rf | Random Forest Regressor | 16.2013 | 780.5170 | 27.2657 | 0.7941 | 0.3588 | 0.3206 | 0.2710 |
| lasso | Lasso Regression | 17.9004 | 892.8924 | 29.1514 | 0.7674 | 0.4272 | 0.3964 | 0.0390 |
| br | Bayesian Ridge | 18.7502 | 933.5030 | 30.0103 | 0.7360 | 0.4335 | 0.4357 | 0.0410 |
| ridge | Ridge Regression | 18.8731 | 959.0366 | 30.4736 | 0.7106 | 0.4374 | 0.4391 | 0.0410 |
| en | Elastic Net | 22.1993 | 1470.8359 | 36.8266 | 0.6957 | 0.4730 | 0.5049 | 0.0400 |
| par | Passive Aggressive Regressor | 21.8871 | 1230.4942 | 34.1936 | 0.6808 | 0.5112 | 0.4529 | 0.0390 |
| dt | Decision Tree Regressor | 22.1643 | 1347.9442 | 35.5630 | 0.6769 | 0.4821 | 0.4135 | 0.0420 |
| knn | K Neighbors Regressor | 20.7092 | 1510.4372 | 37.3776 | 0.6739 | 0.4332 | 0.3857 | 0.1030 |
| lr | Linear Regression | 19.0723 | 1011.0819 | 31.2553 | 0.6560 | 0.4370 | 0.4424 | 1.2180 |
| huber | Huber Regressor | 17.9627 | 1052.4064 | 31.5784 | 0.6243 | 0.4249 | 0.3799 | 0.0550 |
| omp | Orthogonal Matching Pursuit | 26.0731 | 1572.8069 | 38.9319 | 0.5597 | 0.6101 | 0.7087 | 0.0410 |
| lightgbm | Light Gradient Boosting Machine | 25.5153 | 3261.1043 | 52.3993 | 0.4383 | 0.5378 | 0.5418 | 0.0910 |
| ada | AdaBoost Regressor | 36.3003 | 2044.0260 | 44.9417 | 0.4115 | 0.7938 | 1.2929 | 0.1010 |
| llar | Lasso Least Angle Regression | 36.6933 | 3491.6093 | 57.2765 | 0.2825 | 0.7607 | 1.0962 | 0.0410 |
| dummy | Dummy Regressor | 41.6761 | 5840.3785 | 72.7636 | -0.0415 | 0.8285 | 1.1971 | 0.0410 |
| lar | Least Angle Regression | 39.1224 | 8475.2532 | 61.5342 | -1.9011 | 0.6099 | 0.9194 | 0.0470 |
# Rebuild the Extra Trees model and inspect it: residuals, prediction-error
# and feature-importance plots.
et = create_model('et')
plot_model(et)
plot_model(et, 'error')
plot_model(et, 'feature')
| MAE | MSE | RMSE | R2 | RMSLE | MAPE | |
|---|---|---|---|---|---|---|
| Fold | ||||||
| 0 | 13.5592 | 629.7592 | 25.0950 | 0.9071 | 0.2558 | 0.2249 |
| 1 | 16.6787 | 702.2626 | 26.5002 | 0.8182 | 0.4371 | 0.4393 |
| 2 | 17.4624 | 780.4370 | 27.9363 | 0.9082 | 0.3854 | 0.3606 |
| 3 | 15.0545 | 464.1016 | 21.5430 | 0.8591 | 0.4311 | 0.4346 |
| 4 | 14.7357 | 971.2941 | 31.1656 | 0.5338 | 0.4516 | 0.2642 |
| 5 | 19.4716 | 1153.7654 | 33.9671 | 0.9127 | 0.3696 | 0.3218 |
| 6 | 15.9831 | 693.8520 | 26.3411 | 0.8342 | 0.3219 | 0.2623 |
| 7 | 13.8932 | 434.9219 | 20.8548 | 0.5105 | 0.3798 | 0.3227 |
| 8 | 13.0689 | 449.6464 | 21.2049 | 0.9495 | 0.3103 | 0.2746 |
| 9 | 16.5107 | 790.8204 | 28.1215 | 0.8613 | 0.3406 | 0.2994 |
| Mean | 15.6418 | 707.0861 | 26.2730 | 0.8094 | 0.3683 | 0.3204 |
| Std | 1.8737 | 220.7460 | 4.1010 | 0.1486 | 0.0591 | 0.0686 |
# Same PyCaret experiment but with robust scaling (same session_id for a
# like-for-like comparison with the z-score run above).
s = setup(df, target="SiteEUI(kBtu/sf)", normalize=True, normalize_method='robust', session_id=7239)
| Description | Value | |
|---|---|---|
| 0 | Session id | 7239 |
| 1 | Target | SiteEUI(kBtu/sf) |
| 2 | Target type | Regression |
| 3 | Data shape | (991, 31) |
| 4 | Train data shape | (693, 31) |
| 5 | Test data shape | (298, 31) |
| 6 | Numeric features | 14 |
| 7 | Categorical features | 2 |
| 8 | Preprocess | True |
| 9 | Imputation type | simple |
| 10 | Numeric imputation | mean |
| 11 | Categorical imputation | mode |
| 12 | Maximum one-hot encoding | 25 |
| 13 | Encoding method | None |
| 14 | Normalize | True |
| 15 | Normalize method | robust |
| 16 | Fold Generator | KFold |
| 17 | Fold Number | 10 |
| 18 | CPU Jobs | -1 |
| 19 | Use GPU | False |
| 20 | Log Experiment | False |
| 21 | Experiment Name | reg-default-name |
| 22 | USI | 12f2 |
# Re-run the model comparison under robust scaling.
best = compare_models()
| Model | MAE | MSE | RMSE | R2 | RMSLE | MAPE | TT (Sec) | |
|---|---|---|---|---|---|---|---|---|
| gbr | Gradient Boosting Regressor | 15.6944 | 706.8327 | 26.2583 | 0.8121 | 0.3758 | 0.3222 | 0.1310 |
| et | Extra Trees Regressor | 16.2495 | 744.5013 | 26.7189 | 0.8114 | 0.3572 | 0.3182 | 0.2020 |
| rf | Random Forest Regressor | 16.2231 | 780.7522 | 27.3030 | 0.7940 | 0.3595 | 0.3217 | 0.2760 |
| ridge | Ridge Regression | 19.0328 | 1079.4048 | 31.8868 | 0.7430 | 0.4352 | 0.4182 | 0.0420 |
| lasso | Lasso Regression | 21.3636 | 1369.1566 | 35.7961 | 0.6851 | 0.4442 | 0.4269 | 0.0390 |
| br | Bayesian Ridge | 18.9329 | 988.4901 | 30.9197 | 0.6818 | 0.4384 | 0.4402 | 0.0430 |
| dt | Decision Tree Regressor | 22.1623 | 1352.0340 | 35.8879 | 0.6782 | 0.4720 | 0.4028 | 0.0400 |
| lr | Linear Regression | 19.0729 | 1011.1515 | 31.2564 | 0.6559 | 0.4379 | 0.4425 | 0.0390 |
| omp | Orthogonal Matching Pursuit | 26.0731 | 1572.8069 | 38.9319 | 0.5597 | 0.6101 | 0.7087 | 0.0420 |
| ada | AdaBoost Regressor | 35.9841 | 1985.5744 | 44.3397 | 0.4635 | 0.7919 | 1.2826 | 0.1040 |
| lightgbm | Light Gradient Boosting Machine | 25.0731 | 3184.6791 | 51.6755 | 0.4573 | 0.5621 | 0.5327 | 0.0740 |
| llar | Lasso Least Angle Regression | 36.6933 | 3491.6093 | 57.2765 | 0.2825 | 0.7607 | 1.0962 | 0.0400 |
| knn | K Neighbors Regressor | 30.0214 | 4524.0120 | 63.5991 | 0.1661 | 0.5846 | 0.5710 | 0.0420 |
| en | Elastic Net | 35.5292 | 5039.9800 | 66.7385 | 0.1481 | 0.7196 | 0.9267 | 0.0390 |
| dummy | Dummy Regressor | 41.6761 | 5840.3785 | 72.7636 | -0.0415 | 0.8285 | 1.1971 | 0.0390 |
| huber | Huber Regressor | 44.8896 | 7233.9096 | 80.3204 | -0.2875 | 1.5847 | 0.8024 | 0.0480 |
| lar | Least Angle Regression | 41.0316 | 9463.6887 | 64.3759 | -1.9570 | 0.6218 | 0.9770 | 0.0500 |
| par | Passive Aggressive Regressor | 37.8094 | 16283.0913 | 93.6760 | -5.0983 | 0.6348 | 0.7238 | 0.0400 |
# Rebuild the Gradient Boosting model and inspect it: residuals,
# prediction-error and feature-importance plots.
gbr = create_model('gbr')
plot_model(gbr)
plot_model(gbr, 'error')
plot_model(gbr, 'feature')
| MAE | MSE | RMSE | R2 | RMSLE | MAPE | |
|---|---|---|---|---|---|---|
| Fold | ||||||
| 0 | 13.5592 | 629.7592 | 25.0950 | 0.9071 | 0.2558 | 0.2249 |
| 1 | 16.6787 | 702.2626 | 26.5002 | 0.8182 | 0.4371 | 0.4393 |
| 2 | 17.4538 | 778.7518 | 27.9061 | 0.9084 | 0.3854 | 0.3606 |
| 3 | 15.0545 | 464.1016 | 21.5430 | 0.8591 | 0.4311 | 0.4346 |
| 4 | 14.4779 | 901.3781 | 30.0230 | 0.5674 | 0.5094 | 0.2601 |
| 5 | 20.1021 | 1220.0132 | 34.9287 | 0.9077 | 0.3786 | 0.3335 |
| 6 | 16.0541 | 694.3984 | 26.3514 | 0.8341 | 0.3220 | 0.2632 |
| 7 | 13.9837 | 437.1955 | 20.9092 | 0.5080 | 0.3873 | 0.3319 |
| 8 | 13.0689 | 449.6464 | 21.2049 | 0.9495 | 0.3103 | 0.2746 |
| 9 | 16.5107 | 790.8204 | 28.1215 | 0.8613 | 0.3406 | 0.2994 |
| Mean | 15.6944 | 706.8327 | 26.2583 | 0.8121 | 0.3758 | 0.3222 |
| Std | 2.0138 | 227.4449 | 4.1634 | 0.1428 | 0.0691 | 0.0690 |
To evaluate the best model, we used the MAPE (mean absolute percentage error). In simpler words, on average the predictions are x percent off. The MAPE was chosen because it is easier to explain to a general public compared to other metrics.
Robust and standard scaling seem to give similar results. We will try to improve the performance by tuning the hyperparameters for the two best performing algorithms, ExtraTrees and GradientBoosting.
# Candidate estimators with a fixed seed so grid-search runs are reproducible.
trees = ExtraTreesRegressor(random_state=42)
boost = GradientBoostingRegressor(random_state=42)
standard = StandardScaler()
robust = RobustScaler()
# One ExtraTrees pipeline per (target transform, scaler) combination.
pipeline_trees_s = create_pipeline(trees, 'ExtraTrees', standard)
pipeline_trees_r = create_pipeline(trees, 'ExtraTrees', robust)
pipeline_trees_s_log = create_log_pipeline(trees, 'ExtraTrees', standard)
pipeline_trees_r_log = create_log_pipeline(trees, 'ExtraTrees', robust)
pipeline_trees_s_power = create_power_pipeline(trees, 'ExtraTrees', standard)
pipeline_trees_r_power = create_power_pipeline(trees, 'ExtraTrees', robust)
#pipeline_trees_s_quantile = create_quantile_pipeline(trees, 'ExtraTrees', standard)
#pipeline_trees_r_quantile = create_quantile_pipeline(trees, 'ExtraTrees', robust)
models = {
    "ExtraTrees_Standard": pipeline_trees_s,
    "ExtraTrees_Robust": pipeline_trees_r,
    "ExtraTrees_Standard_Log": pipeline_trees_s_log,
    "ExtraTrees_Robust_Log": pipeline_trees_r_log,
    "ExtraTrees_Standard_Power": pipeline_trees_s_power,
    "ExtraTrees_Robust_Power": pipeline_trees_r_power,
    # "ExtraTrees_Standard_Quantile": pipeline_trees_s_quantile,
    # "ExtraTrees_Robust_Quantile": pipeline_trees_r_quantile,
}
# Parameter grids: plain pipelines tune <name>__n_estimators, target-transform
# pipelines go through the TransformedTargetRegressor wrapper, hence the extra
# "__regressor" level in the key.
# NOTE(review): the results table below also shows a criterion parameter —
# it appears to come from an earlier run with a larger grid; confirm.
params = {
    "ExtraTrees_Standard": {"ExtraTrees__n_estimators": [64, 128]},
    "ExtraTrees_Robust": {"ExtraTrees__n_estimators": [64, 128]},
    "ExtraTrees_Standard_Log": {"ExtraTrees__regressor__n_estimators": [64, 128]},
    "ExtraTrees_Robust_Log": {"ExtraTrees__regressor__n_estimators": [64, 128]},
    "ExtraTrees_Standard_Power": {"ExtraTrees__regressor__n_estimators": [64, 128]},
    "ExtraTrees_Robust_Power": {"ExtraTrees__regressor__n_estimators": [64, 128]},
    # "ExtraTrees_Standard_Quantile": {"ExtraTrees__regressor__n_estimators": [64, 128, 256]},
    # "ExtraTrees_Robust_Quantile": {"ExtraTrees__regressor__n_estimators": [64, 128, 256]},
}
# Run all grid searches and print one sorted summary row per combination.
# neg_* scorers are negative, so rows closest to 0 are best.
grid_search = GridSearchResults(models, params)
grid_search.fit(X, y, scoring='neg_mean_absolute_percentage_error')
grid_search.score_summary()
Running GridSearchCV for ExtraTrees_Standard. Fitting 10 folds for each of 4 candidates, totalling 40 fits Grid Search took 60.683 seconds for ExtraTrees_Standard Running GridSearchCV for ExtraTrees_Robust. Fitting 10 folds for each of 4 candidates, totalling 40 fits Grid Search took 38.826 seconds for ExtraTrees_Robust Running GridSearchCV for ExtraTrees_Standard_Log. Fitting 10 folds for each of 4 candidates, totalling 40 fits Grid Search took 35.987 seconds for ExtraTrees_Standard_Log Running GridSearchCV for ExtraTrees_Robust_Log. Fitting 10 folds for each of 4 candidates, totalling 40 fits Grid Search took 40.536 seconds for ExtraTrees_Robust_Log Running GridSearchCV for ExtraTrees_Standard_Power. Fitting 10 folds for each of 4 candidates, totalling 40 fits Grid Search took 40.586 seconds for ExtraTrees_Standard_Power Running GridSearchCV for ExtraTrees_Robust_Power. Fitting 10 folds for each of 4 candidates, totalling 40 fits Grid Search took 41.584 seconds for ExtraTrees_Robust_Power ExtraTrees_Standard ExtraTrees_Robust ExtraTrees_Standard_Log ExtraTrees_Robust_Log ExtraTrees_Standard_Power ExtraTrees_Robust_Power
| estimator | min_score | mean_score | max_score | std_score | ExtraTrees__criterion | ExtraTrees__n_estimators | ExtraTrees__regressor__criterion | ExtraTrees__regressor__n_estimators | |
|---|---|---|---|---|---|---|---|---|---|
| 13 | ExtraTrees_Robust_Log | -0.395684 | -0.275236 | -0.13775 | 0.070466 | NaN | NaN | squared_error | 128 |
| 15 | ExtraTrees_Robust_Log | -0.399915 | -0.276116 | -0.138315 | 0.069731 | NaN | NaN | absolute_error | 128 |
| 17 | ExtraTrees_Standard_Power | -0.392464 | -0.276613 | -0.137008 | 0.068797 | NaN | NaN | squared_error | 128 |
| 11 | ExtraTrees_Standard_Log | -0.417219 | -0.276815 | -0.139842 | 0.072682 | NaN | NaN | absolute_error | 128 |
| 19 | ExtraTrees_Standard_Power | -0.413741 | -0.277071 | -0.142499 | 0.07 | NaN | NaN | absolute_error | 128 |
| 12 | ExtraTrees_Robust_Log | -0.39883 | -0.2771 | -0.137367 | 0.072432 | NaN | NaN | squared_error | 64 |
| 9 | ExtraTrees_Standard_Log | -0.39556 | -0.27722 | -0.142753 | 0.069032 | NaN | NaN | squared_error | 128 |
| 21 | ExtraTrees_Robust_Power | -0.403067 | -0.277788 | -0.139129 | 0.07084 | NaN | NaN | squared_error | 128 |
| 20 | ExtraTrees_Robust_Power | -0.404028 | -0.277854 | -0.144303 | 0.070803 | NaN | NaN | squared_error | 64 |
| 18 | ExtraTrees_Standard_Power | -0.413959 | -0.277901 | -0.140911 | 0.071145 | NaN | NaN | absolute_error | 64 |
| 16 | ExtraTrees_Standard_Power | -0.398422 | -0.277993 | -0.138416 | 0.07028 | NaN | NaN | squared_error | 64 |
| 14 | ExtraTrees_Robust_Log | -0.397139 | -0.278307 | -0.140196 | 0.069019 | NaN | NaN | absolute_error | 64 |
| 23 | ExtraTrees_Robust_Power | -0.411475 | -0.278509 | -0.140951 | 0.071369 | NaN | NaN | absolute_error | 128 |
| 22 | ExtraTrees_Robust_Power | -0.41137 | -0.279027 | -0.143914 | 0.071307 | NaN | NaN | absolute_error | 64 |
| 8 | ExtraTrees_Standard_Log | -0.392155 | -0.279091 | -0.147588 | 0.066529 | NaN | NaN | squared_error | 64 |
| 10 | ExtraTrees_Standard_Log | -0.420494 | -0.280431 | -0.138821 | 0.076865 | NaN | NaN | absolute_error | 64 |
| 5 | ExtraTrees_Robust | -0.426859 | -0.31037 | -0.16295 | 0.076867 | squared_error | 128 | NaN | NaN |
| 4 | ExtraTrees_Robust | -0.437431 | -0.311909 | -0.163214 | 0.080426 | squared_error | 64 | NaN | NaN |
| 3 | ExtraTrees_Standard | -0.485899 | -0.314438 | -0.162477 | 0.088088 | absolute_error | 128 | NaN | NaN |
| 1 | ExtraTrees_Standard | -0.456897 | -0.31455 | -0.154318 | 0.081718 | squared_error | 128 | NaN | NaN |
| 2 | ExtraTrees_Standard | -0.498638 | -0.316153 | -0.159994 | 0.092293 | absolute_error | 64 | NaN | NaN |
| 0 | ExtraTrees_Standard | -0.467071 | -0.317274 | -0.150269 | 0.083295 | squared_error | 64 | NaN | NaN |
| 7 | ExtraTrees_Robust | -0.478303 | -0.318025 | -0.159961 | 0.087949 | absolute_error | 128 | NaN | NaN |
| 6 | ExtraTrees_Robust | -0.468311 | -0.318422 | -0.156534 | 0.088702 | absolute_error | 64 | NaN | NaN |
#grid_search = GridSearchResults(models, params)
#grid_search.fit(X, y, scoring='r2')
#grid_search.score_summary()
# Same experiment for GradientBoosting: one pipeline per (target transform,
# scaler) combination.
pipeline_boost_s = create_pipeline(boost, 'GradientBoosting', standard)
pipeline_boost_r = create_pipeline(boost, 'GradientBoosting', robust)
pipeline_boost_s_log = create_log_pipeline(boost, 'GradientBoosting', standard)
pipeline_boost_r_log = create_log_pipeline(boost, 'GradientBoosting', robust)
pipeline_boost_s_power = create_power_pipeline(boost, 'GradientBoosting', standard)
pipeline_boost_r_power = create_power_pipeline(boost, 'GradientBoosting', robust)
#pipeline_boost_s_quantile = create_quantile_pipeline(boost, 'GradientBoosting', standard)
#pipeline_boost_r_quantile = create_quantile_pipeline(boost, 'GradientBoosting', robust)
models = {
    "GradientBoosting_Standard": pipeline_boost_s,
    "GradientBoosting_Robust": pipeline_boost_r,
    "GradientBoosting_Standard_Log": pipeline_boost_s_log,
    "GradientBoosting_Robust_Log": pipeline_boost_r_log,
    "GradientBoosting_Standard_Power": pipeline_boost_s_power,
    "GradientBoosting_Robust_Power": pipeline_boost_r_power,
    # "GradientBoosting_Standard_Quantile": pipeline_boost_s_quantile,
    # "GradientBoosting_Robust_Quantile": pipeline_boost_r_quantile,
}
# NOTE(review): the two *_Quantile entries below have no matching pipeline in
# models — harmless leftovers (GridSearchResults only checks models ⊆ params)
# but they could be removed together with the commented-out pipelines.
params = {
    "GradientBoosting_Standard": {"GradientBoosting__n_estimators": [64, 128, 256],
    #                              "GradientBoosting__learning_rate": [0.05, 0.1, 0.2]
                                  },
    "GradientBoosting_Robust": {"GradientBoosting__n_estimators": [64, 128, 256],
    #                            "GradientBoosting__learning_rate": [0.05, 0.1, 0.2]
                                },
    "GradientBoosting_Standard_Log": {"GradientBoosting__regressor__n_estimators": [64, 128, 256],
    #                                  "GradientBoosting__regressor__learning_rate": [0.05, 0.1, 0.2]
                                      },
    "GradientBoosting_Robust_Log": {"GradientBoosting__regressor__n_estimators": [64, 128, 256],
    #                                "GradientBoosting__regressor__learning_rate": [0.05, 0.1, 0.2]
                                    },
    "GradientBoosting_Standard_Power": {"GradientBoosting__regressor__n_estimators": [64, 128, 256],
    #                                    "GradientBoosting__regressor__learning_rate": [0.05, 0.1, 0.2]
                                        },
    "GradientBoosting_Robust_Power": {"GradientBoosting__regressor__n_estimators": [64, 128, 256],
    #                                  "GradientBoosting__regressor__learning_rate": [0.05, 0.1, 0.2]
                                      },
    "GradientBoosting_Standard_Quantile": {"GradientBoosting__regressor__n_estimators": [64, 128, 256],
    #                                       "GradientBoosting__regressor__learning_rate": [0.05, 0.1, 0.2]
                                           },
    "GradientBoosting_Robust_Quantile": {"GradientBoosting__regressor__n_estimators": [64, 128, 256],
    #                                     "GradientBoosting__regressor__learning_rate": [0.05, 0.1, 0.2]
                                         },
}
# Run all grid searches and summarise, best (closest to 0) first.
grid_search = GridSearchResults(models, params)
grid_search.fit(X, y, scoring='neg_mean_absolute_percentage_error')
grid_search.score_summary()
Running GridSearchCV for GradientBoosting_Standard. Fitting 10 folds for each of 3 candidates, totalling 30 fits Grid Search took 6.889 seconds for GradientBoosting_Standard Running GridSearchCV for GradientBoosting_Robust. Fitting 10 folds for each of 3 candidates, totalling 30 fits Grid Search took 2.755 seconds for GradientBoosting_Robust Running GridSearchCV for GradientBoosting_Standard_Log. Fitting 10 folds for each of 3 candidates, totalling 30 fits Grid Search took 2.704 seconds for GradientBoosting_Standard_Log Running GridSearchCV for GradientBoosting_Robust_Log. Fitting 10 folds for each of 3 candidates, totalling 30 fits Grid Search took 2.762 seconds for GradientBoosting_Robust_Log Running GridSearchCV for GradientBoosting_Standard_Power. Fitting 10 folds for each of 3 candidates, totalling 30 fits Grid Search took 2.708 seconds for GradientBoosting_Standard_Power Running GridSearchCV for GradientBoosting_Robust_Power. Fitting 10 folds for each of 3 candidates, totalling 30 fits Grid Search took 2.795 seconds for GradientBoosting_Robust_Power GradientBoosting_Standard GradientBoosting_Robust GradientBoosting_Standard_Log GradientBoosting_Robust_Log GradientBoosting_Standard_Power GradientBoosting_Robust_Power
| estimator | min_score | mean_score | max_score | std_score | GradientBoosting__n_estimators | GradientBoosting__regressor__n_estimators | |
|---|---|---|---|---|---|---|---|
| 7 | GradientBoosting_Standard_Log | -0.353597 | -0.256239 | -0.133433 | 0.061251 | NaN | 128 |
| 16 | GradientBoosting_Robust_Power | -0.345161 | -0.256595 | -0.148967 | 0.058497 | NaN | 128 |
| 13 | GradientBoosting_Standard_Power | -0.348172 | -0.256803 | -0.142765 | 0.060649 | NaN | 128 |
| 10 | GradientBoosting_Robust_Log | -0.342296 | -0.257707 | -0.141152 | 0.059409 | NaN | 128 |
| 14 | GradientBoosting_Standard_Power | -0.35281 | -0.261338 | -0.14555 | 0.063703 | NaN | 256 |
| 17 | GradientBoosting_Robust_Power | -0.359828 | -0.261887 | -0.151287 | 0.062173 | NaN | 256 |
| 11 | GradientBoosting_Robust_Log | -0.351858 | -0.263907 | -0.143806 | 0.06066 | NaN | 256 |
| 8 | GradientBoosting_Standard_Log | -0.363138 | -0.264372 | -0.136835 | 0.063491 | NaN | 256 |
| 9 | GradientBoosting_Robust_Log | -0.344034 | -0.26585 | -0.146778 | 0.058828 | NaN | 64 |
| 6 | GradientBoosting_Standard_Log | -0.347502 | -0.266671 | -0.142378 | 0.0604 | NaN | 64 |
| 12 | GradientBoosting_Standard_Power | -0.346392 | -0.268816 | -0.151684 | 0.059036 | NaN | 64 |
| 15 | GradientBoosting_Robust_Power | -0.349063 | -0.269242 | -0.157284 | 0.05898 | NaN | 64 |
| 2 | GradientBoosting_Standard | -0.421588 | -0.298475 | -0.148716 | 0.083197 | 256 | NaN |
| 1 | GradientBoosting_Standard | -0.426392 | -0.300771 | -0.152358 | 0.080342 | 128 | NaN |
| 5 | GradientBoosting_Robust | -0.430349 | -0.303856 | -0.148915 | 0.087544 | 256 | NaN |
| 4 | GradientBoosting_Robust | -0.441701 | -0.305853 | -0.154889 | 0.085116 | 128 | NaN |
| 0 | GradientBoosting_Standard | -0.465994 | -0.331481 | -0.173747 | 0.090547 | 64 | NaN |
| 3 | GradientBoosting_Robust | -0.488718 | -0.33582 | -0.180679 | 0.092815 | 64 | NaN |
#grid_search = GridSearchResults(models, params)
#grid_search.fit(X, y, scoring='r2')
#grid_search.score_summary()
Log and Power Transform greatly improve the results.
Compared to ExtraTrees, GradientBoosting is faster and gives better predictions. It will be our final model
Standard and Robust scaling give similar results. Our model seems to perform better with 128 than 256 estimators. The difference is small between the two parameters. Therefore we will use 128 estimators to reduce the necessary computing time.
# Re-import so this cell can run standalone (already imported at the top of the notebook).
from sklearn.model_selection import cross_val_predict
# Final model chosen from the grid search: GradientBoosting with 128 estimators.
model = GradientBoostingRegressor(n_estimators=128)
# create_power_pipeline is defined earlier in the notebook — presumably it chains the
# given scaler, a power transform of the target, and the model; confirm against its definition.
pipeline = create_power_pipeline(model, 'GradientBoosting', StandardScaler())
# Out-of-fold predictions: each row is predicted by a fold that never trained on it.
y_pred = cross_val_predict(pipeline, X, y, cv=10, n_jobs=-1)
# Attach building names and predictions to df for the inspection table and plots below.
df["PropertyName"] = names['PropertyName']
df["Predictions"] = y_pred
| PropertyName | SiteEUI(kBtu/sf) | Predictions | |
|---|---|---|---|
| 168 | Century Square - COS Compliance | 78.099998 | 71.327441 |
| 942 | Villa Academy - Main bldg. | 38.299999 | 37.463056 |
| 146 | Puget Sound Plaza | 90.699997 | 66.607551 |
| 484 | Emerald Landing I | 48.000000 | 58.601471 |
| 25 | 5th and Pine | 56.299999 | 57.155819 |
| 907 | West Seattle Thriftway | 273.700012 | 259.471079 |
| 658 | Bliss Hall | 48.500000 | 54.779727 |
| 24 | Seattle Hilton Hotel | 46.400002 | 70.140601 |
| 321 | Mutual Life Building | 54.299999 | 61.388507 |
| 205 | City Place II - SEDO | 42.400002 | 42.247619 |
| 153 | Ferguson Terminal | 72.199997 | 74.626835 |
| 975 | Paul G Allen Athletic Center | 26.200001 | 40.738686 |
| 571 | Fourth and Union | 132.800003 | 79.217579 |
| 531 | Northlake Plaza | 42.799999 | 42.152489 |
| 271 | 1260 Mercer | 62.400002 | 43.606848 |
| 793 | Guardian Security Systems | 41.099998 | 33.361377 |
| 463 | NW BLDG | 124.000000 | 78.483915 |
| 752 | WES 2233 LLC | 11.900000 | 16.767477 |
| 283 | Safeway 1965 - Rainier Ave | 241.300003 | 251.571520 |
| 591 | E0010 - Elliott Court | 41.299999 | 39.218049 |
| 717 | Planned Parenthood Western | 64.400002 | 63.563712 |
| 350 | Gensco Incorporated | 14.300000 | 12.566163 |
| 673 | Central Campus | 173.399994 | 87.324485 |
| 696 | 2345 Eastlake | 59.200001 | 47.298078 |
| 791 | 820 | 28.600000 | 32.724540 |
# Predicted vs. actual energy use: points on the red dashed y=x line are perfect predictions.
plt.figure(figsize=(16,12))
sns.lineplot(x=[0,800], y=[0,800], linestyle='--', color='red')
sns.scatterplot(data=df, x='SiteEUI(kBtu/sf)', y='Predictions');
# Signed relative error in percent (positive = over-prediction).
df['ErrorPercentage'] = 100 * (df['Predictions'] - df['SiteEUI(kBtu/sf)']) / df['SiteEUI(kBtu/sf)']
# Plain string literal: the f-prefix was removed because there is nothing to interpolate.
text = "Distribution de l'erreur en %"
plt.figure(figsize=(12,6))
sns.histplot(data=df, x='ErrorPercentage')
plt.title(text)
plt.xlim([-100,300]);
# Vectorized np.abs over the whole Series instead of the slower element-wise
# .apply(np.abs); this also matches how 'AbsoluteError(W/out)' is computed later.
df['AbsoluteError'] = np.abs(df['ErrorPercentage'])
# Summary statistics of the absolute percentage error for the plot title.
median_error = round(df['AbsoluteError'].median(), 2)
mean_error = round(df['AbsoluteError'].mean(), 2)
q_25 = np.quantile(df['AbsoluteError'], 0.25)
q_75 = np.quantile(df['AbsoluteError'], 0.75)
q_90 = np.quantile(df['AbsoluteError'], 0.90)
text = f"Distribution de l'erreur en %:\nMoyenne: {mean_error} Mediane: {median_error}"
text += f"\n25%: {round(q_25,2)}, 75%: {round(q_75,2)}, 90%: {round(q_90, 2)}"
plt.figure(figsize=(12,6))
sns.histplot(data=df, x='AbsoluteError')
plt.title(text)
plt.xlim([-5,300]);
# Median absolute error per building type, worst category first.
categories_error = (
    df.groupby('LargestPropertyUseType')[['AbsoluteError']]
    .median()
    .reset_index()
    .sort_values(by='AbsoluteError', ascending=False)
)
plt.figure(figsize=(16,12))
sns.barplot(data=categories_error, y='LargestPropertyUseType', x='AbsoluteError', color='gray');
# Drop the ENERGYSTARScore feature to measure how much the model depends on it.
X_e = X.drop(['ENERGYSTARScore'], axis=1)
X_e.columns
# num_types is defined earlier in the notebook — presumably a list of numeric dtypes; confirm.
num_columns = X_e.select_dtypes(include=num_types).columns.tolist()
pipeline = create_power_pipeline(model, 'GradientBoosting', StandardScaler())
# Out-of-fold predictions for the model trained WITHOUT the ENERGYSTAR score.
y_pred_e = cross_val_predict(pipeline, X_e, y, cv=10, n_jobs=-1)
df['pred_without_energy'] = y_pred_e
# Predicted vs. actual without the ENERGYSTAR score; y=x dashed line marks perfect predictions.
plt.figure(figsize=(16,12))
sns.lineplot(x=[0,800], y=[0,800], linestyle='--', color='red')
sns.scatterplot(data=df, x='SiteEUI(kBtu/sf)', y='pred_without_energy');
# Signed relative error (in %) for the model trained without ENERGYSTARScore.
df['ErrorPercentage(W/out)'] = 100 * (df['pred_without_energy'] - df['SiteEUI(kBtu/sf)']) / df['SiteEUI(kBtu/sf)']
df['AbsoluteError(W/out)'] = np.abs(df['ErrorPercentage(W/out)'])
# Plain string literal: the f-prefix was removed because there is nothing to interpolate.
text = "Distribution de l'erreur en %"
plt.figure(figsize=(12,6))
sns.histplot(data=df, x='ErrorPercentage(W/out)')
plt.title(text)
plt.xlim([-100,300]);
# Summary statistics of the absolute error without the ENERGYSTAR score.
abs_err = df['AbsoluteError(W/out)']
median_error = round(abs_err.median(), 2)
mean_error = round(abs_err.mean(), 2)
q_25, q_75, q_90 = (np.quantile(abs_err, q) for q in (0.25, 0.75, 0.90))
text = f"Distribution de l'erreur en %:\nMoyenne: {mean_error} Mediane: {median_error}"
text += f"\n25%: {round(q_25,2)}, 75%: {round(q_75,2)}, 90%: {round(q_90, 2)}"
plt.figure(figsize=(12,6))
sns.histplot(data=df, x='AbsoluteError(W/out)')
plt.title(text)
plt.xlim([-5,300]);
categories_error = df.groupby('LargestPropertyUseType')[['AbsoluteError(W/out)']].median().reset_index()
categories_error.sort_values(by='AbsoluteError(W/out)', inplace=True, ascending=False)
plt.figure(figsize=(16,12))
sns.barplot(data=categories_error, y='LargestPropertyUseType', x='AbsoluteError(W/out)', color='gray');
# Median consumption per building type — the "no machine learning" baseline predictor.
median_per_category = df.groupby('LargestPropertyUseType')['SiteEUI(kBtu/sf)'].median().reset_index()
median_per_category
# Build the {building type: median} mapping directly with set_index(...).to_dict()
# instead of the slow, unidiomatic iterrows() loop. Result is identical.
median_dic = median_per_category.set_index('LargestPropertyUseType')['SiteEUI(kBtu/sf)'].to_dict()
median_dic
{'Bank/Finance/Courthouse': 60.5,
'Data Center': 701.0,
'Hospital (General Medical & Surgical)': 221.05000305,
'Hotel': 79.450000765,
'K-12 School': 47.70000076,
'Medical Office': 78.90000153,
'Office': 52.90000153,
'Residence Hall/Dormitory': 54.79999924,
'Retail Store': 59.40000153,
'Senior Care Community': 99.350002315,
'Supermarket/Grocery Store': 248.8000031,
'Warehouse': 26.79999924,
'Worship Facility': 32.29999924}
# Baseline error: predict each building by the median of its own category.
df['CategoryMedian'] = df['LargestPropertyUseType'].map(median_dic)
df['ErrorMedianGuess'] = 100 * np.abs(df['SiteEUI(kBtu/sf)'] - df['CategoryMedian']) / df['SiteEUI(kBtu/sf)']
guess_err = df['ErrorMedianGuess']
mean = round(guess_err.mean(), 3)
median = round(guess_err.median(), 3)
q_25, q_75, q_90 = (np.quantile(guess_err, q) for q in (0.25, 0.75, 0.90))
text = f"Distribution de l'erreur en %:\nMoyenne: {mean} Mediane: {median}"
text += f"\n25%: {round(q_25,2)}, 75%: {round(q_75,2)}, 90%: {round(q_90, 2)}"
plt.figure(figsize=(16,6))
plt.title(text)
sns.histplot(df, x='ErrorMedianGuess')
plt.xlim([-5,300]);
The model without the ENERGYSTAR score does not perform better than simply using the median of each category to predict the energy consumption. Without the ENERGYSTAR score, machine learning is not needed.
Gradient Boosting is the algorithm that gives us the best results, with a much lower computing time than the RandomForest and ExtraTrees algorithms.
If the City of Seattle does not want to calculate the ENERGYSTAR score, it will be better to estimate the building consumption without Machine Learning. However the model we made in this notebook can be useful if the ENERGYSTAR score is used in the future